{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import movie_reviews\n",
    "from sklearn.model_selection import StratifiedShuffleSplit\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.collocations import BigramCollocationFinder\n",
    "from nltk.metrics import BigramAssocMeasures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_data():\n",
    "    dataset = []\n",
    "    y_labels = []\n",
    "    for cat in movie_reviews.categories():\n",
    "        for fileid in movie_reviews.fileids(cat):\n",
    "            words = list(movie_reviews.words(fileid))\n",
    "            dataset.append((words, cat))\n",
    "            y_labels.append(cat)\n",
    "    return dataset, y_labels\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_train_test(input_dataset, ylabels):\n",
    "    train_size = 0.7\n",
    "    test_size = 1 - train_size\n",
    "    stratified_split = StratifiedShuffleSplit(test_size=test_size, n_splits=1, random_state=77)\n",
    "    \n",
    "    for train_indx, test_indx in stratified_split.split(input_dataset, ylabels):\n",
    "        train = [input_dataset[i] for i in train_indx]\n",
    "        train_y = [ylabels[i] for i in train_indx]\n",
    "        \n",
    "        test = [input_dataset[i] for i in test_indx]\n",
    "        test_y = [ylabels[i] for i in test_indx]\n",
    "    return train, test, train_y, test_y\n",
    "        \n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_word_features(instance):\n",
    "    feature_set = {}\n",
    "    words = instance[0]\n",
    "    for word in words:\n",
    "        feature_set[word] = 1\n",
    "        \n",
    "    return (feature_set, instance[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_negate_features(instance):\n",
    "    words = instance[0]\n",
    "    final_words = []\n",
    "    negate = False\n",
    "    negate_words = ['no', 'not']\n",
    "    for word in words:\n",
    "        if negate:\n",
    "            word = 'Not_' + word\n",
    "            negate = False\n",
    "        if word not in negate_words:\n",
    "            final_words.append(word)\n",
    "        else:\n",
    "            negate = True\n",
    "    \n",
    "    feature_set = {}\n",
    "    for word in final_words:\n",
    "        feature_set[word] = 1\n",
    "    return (feature_set, instance[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_stop_words(in_data):\n",
    "    stopword_list = stopwords.words('english')\n",
    "    negate_words = ['no', 'not']\n",
    "    new_stopwords = [word for word in stopword_list if word not in negate_words]\n",
    "    label = in_data[1]\n",
    "    words = [word for word in in_data[0] if word not in new_stopwords]\n",
    "    return (words, label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_keyphrase_features(instance):\n",
    "    feature_set = {}\n",
    "    instance = remove_stop_words(instance)\n",
    "    words = instance[0]\n",
    "    \n",
    "    bigram_finder = BigramCollocationFinder.from_words(words)\n",
    "    bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq, 400)\n",
    "    for bigram in bigrams:\n",
    "        feature_set[bigram] = 1\n",
    "    return (feature_set, instance[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_model(features):\n",
    "    model = nltk.NaiveBayesClassifier.train(features)\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def probe_model(model, features, dataset_type='Train'):\n",
    "    accuracy = nltk.classify.accuracy(model, features)\n",
    "    print('\\n' + dataset_type + 'Accuracy = %.2f'%(accuracy*100) + '%')\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_features(model, no_features=5):\n",
    "    print('\\nFeature Importance')\n",
    "    print('='*80 + '\\n')\n",
    "    print(model.show_most_informative_features(no_features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_model_cycle_1(train_data, dev_data):\n",
    "    train_features = map(build_word_features, train_data)\n",
    "    dev_features = map(build_word_features, dev_data)\n",
    "    model = build_model(train_features)\n",
    "    probe_model(model, train_features)\n",
    "    probe_model(model, dev_features, 'Dev')\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_model_cycle_2(train_data, dev_data):\n",
    "    train_features = map(build_negate_features, train_data)\n",
    "    dev_features = map(build_negate_features, dev_data)\n",
    "    model = build_model(train_features)\n",
    "    probe_model(model, train_features)\n",
    "    probe_model(model, dev_features, 'Dev')\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_model_cycle_3(train_data, dev_data):\n",
    "    train_features = map(build_keyphrase_features, train_data)\n",
    "    dev_features = map(build_keyphrase_features, dev_data)\n",
    "    model = build_model(train_features)\n",
    "    probe_model(model, train_features)\n",
    "    probe_model(model, dev_features, 'Dev')\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Original Data Size =  2000\n",
      "\n",
      "Training Data Size =  1399\n",
      "\n",
      "Dev Data Size =  420\n",
      "\n",
      "Test Data Size =  181\n",
      "\n",
      "TrainAccuracy = 0.00%\n",
      "\n",
      "DevAccuracy = 0.00%\n",
      "\n",
      "Feature Importance\n",
      "==============================\n",
      "\n",
      "Most Informative Features\n",
      "               stupidity = 1                 neg : pos    =     15.6 : 1.0\n",
      "                  warned = 1                 neg : pos    =     11.7 : 1.0\n",
      "             wonderfully = 1                 pos : neg    =     11.5 : 1.0\n",
      "             outstanding = 1                 pos : neg    =     11.0 : 1.0\n",
      "               ludicrous = 1                 neg : pos    =     11.0 : 1.0\n",
      "None\n",
      "\n",
      "TrainAccuracy = 0.00%\n",
      "\n",
      "DevAccuracy = 0.00%\n",
      "\n",
      "Feature Importance\n",
      "==============================\n",
      "\n",
      "Most Informative Features\n",
      "               stupidity = 1                 neg : pos    =     15.6 : 1.0\n",
      "             wonderfully = 1                 pos : neg    =     14.7 : 1.0\n",
      "               Not_funny = 1                 neg : pos    =     13.0 : 1.0\n",
      "                  warned = 1                 neg : pos    =     11.7 : 1.0\n",
      "             outstanding = 1                 pos : neg    =     11.0 : 1.0\n",
      "None\n",
      "\n",
      "TrainAccuracy = 0.00%\n",
      "\n",
      "DevAccuracy = 0.00%\n",
      "\n",
      "Feature Importance\n",
      "==============================\n",
      "\n",
      "Most Informative Features\n",
      "        ('one', 'worst') = 1                 neg : pos    =     13.0 : 1.0\n",
      "       ('waste', 'time') = 1                 neg : pos    =     13.0 : 1.0\n",
      "        ('.', 'cameron') = 1                 pos : neg    =     11.7 : 1.0\n",
      "        ('perfect', ',') = 1                 pos : neg    =     11.7 : 1.0\n",
      "          ('-', 'notch') = 1                 pos : neg    =     11.7 : 1.0\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "input_dataset, y_labels = get_data()\n",
    "train_data, all_test_data, train_y, all_test_y = get_train_test(input_dataset, y_labels)\n",
    "dev_data, test_data, dev_y, test_y = get_train_test(all_test_data, all_test_y)\n",
    "\n",
    "print('\\nOriginal Data Size = ', len(input_dataset))\n",
    "print('\\nTraining Data Size = ', len(train_data))\n",
    "print('\\nDev Data Size = ', len(dev_data))\n",
    "print('\\nTest Data Size = ', len(test_data))\n",
    "\n",
    "model_cycle_1 = build_model_cycle_1(train_data, dev_data)\n",
    "show_features(model_cycle_1)\n",
    "\n",
    "model_cycle_2 = build_model_cycle_2(train_data, dev_data)\n",
    "show_features(model_cycle_2)\n",
    "\n",
    "model_cycle_3 = build_model_cycle_3(train_data, dev_data)\n",
    "show_features(model_cycle_3)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nlp",
   "language": "python",
   "name": "nlp"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
